1 Setup

library(tidyverse)
library(here)
library(readr)
library(readxl)
library(data.table)

1.1 Loading in cleaned data

candy <- read_csv(here("clean_data/candy_clean.csv"),
# specifying data types
col_types = cols(gender = col_character(), country = col_character()))

2 Data

2.1 Raw

Three seperate Excel files with different formatting

# 2015
"raw_data/boing-boing-candy-2015.xlsx" %>%
    here() %>%
    read_excel() %>% 
    data.table()
# 2016
"raw_data/boing-boing-candy-2016.xlsx" %>%
    here() %>%
    read_excel() %>% 
    data.table()
# 2017
"raw_data/boing-boing-candy-2017.xlsx" %>%
    here() %>%
    read_excel() %>% 
    data.table()

2.2 Clean data

candy %>% 
    data.table()

3 Questions

3.1 Number of ratings

What is the total number of candy ratings given across the three years?

candy %>%
# filtering out NAs
  filter(!is.na(rating)) %>%
  nrow()
## [1] 757108

3.2 Halloween

What was the average age of people who are going out trick or treating and the average age of people not going trick or treating?

candy %>%
  group_by(going_out) %>%
# Rounding to whole years
  summarise(avg_age = round(mean(age, na.rm = TRUE))) %>%
# Yes first
    arrange(desc(going_out))
## `summarise()` ungrouping output (override with `.groups` argument)

3.3 Impressions

For each of joy, despair and meh, which candy bar recieved these ratings the most?

candy %>%
# Filtering out NAs
  filter(!is.na(rating)) %>%
  group_by(rating, candy) %>%
  summarise(
    count = n()
  ) %>%
  filter(count == max(count))
## `summarise()` regrouping output by 'rating' (override with `.groups` argument)

3.4 Sad Starbursts

How many people rated Starburst as despair?

candy %>%
  filter(candy == "starburst") %>%
  filter(rating == "despair") %>%
  summarise(id_count = n_distinct(person_id))

For the next two questions, count despair as -1, joy as +1 and meh as 0.

candy <-
  candy %>%
  mutate(rating_numeric = recode(rating,
                                 joy = 1,
                                 despair = -1,
                                 meh = 0))

3.5 Overall favourite

What was the most popular candy bar by this rating system for each gender in the dataset?

candy %>%
  group_by(gender, candy) %>%
  summarise(
    avg_rating = mean(rating_numeric, na.rm = TRUE)
  ) %>%
  filter(avg_rating == max(avg_rating))
## `summarise()` regrouping output by 'gender' (override with `.groups` argument)

3.6 Candy of the year

What was the most popular candy bar in each year?

candy %>%
  group_by(year, candy) %>%
  summarise(avg_rating = mean(rating_numeric, na.rm = TRUE)) %>%
  filter(avg_rating == max(avg_rating))
## `summarise()` regrouping output by 'year' (override with `.groups` argument)

3.7 Country favourites

What was the most popular candy bar by this rating for people in US, Canada, UK and all other countries?

candy %>%
  mutate(country = if_else(country %in% c("united states",
                                          "canada",
                                          "united kingdom"),
                           country, "other")) %>%
  group_by(country, candy) %>%
  summarise(avg_rating = mean(rating_numeric, na.rm = TRUE)) %>%
  filter(avg_rating == max(avg_rating))
## `summarise()` regrouping output by 'country' (override with `.groups` argument)